The date range from 01-21-2020 to 03-23-2023. The cases range from 1 to 12169158. The deaths range from 0 to 104277. There is a positive correlation between pop_density and deathsper100k. As pop_density increases, the deathsper100k also increases.
Loading required package: ggplot2
Attaching package: 'plotly'
The following object is masked from 'package:ggplot2':
last_plot
The following object is masked from 'package:stats':
filter
The following object is masked from 'package:graphics':
layout
library (ggplot2)
library (data.table)
library (zoo)
The following objects are masked from 'package:base':
as.Date, as.Date.numeric
cv_states <- as.data.frame (read.csv ("https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv" ))
state_pops <- as.data.frame (read.csv ("https://raw.githubusercontent.com/COVID19Tracking/associated-data/master/us_census_data/us_census_2018_population_estimates_states.csv" ))
state_pops$ abb <- state_pops$ state
state_pops$ state <- state_pops$ state_name
state_pops$ state_name <- NULL
cv_states <- merge (cv_states, state_pops, by.x = "state" , by.y = "state" )
dim (cv_states)
state date fips cases deaths geo_id population pop_density abb
1 Alabama 2023-01-04 1 1587224 21263 1 4887871 96.50939 AL
2 Alabama 2020-04-25 1 6213 213 1 4887871 96.50939 AL
3 Alabama 2023-02-26 1 1638348 21400 1 4887871 96.50939 AL
4 Alabama 2022-12-03 1 1549285 21129 1 4887871 96.50939 AL
5 Alabama 2020-05-06 1 8691 343 1 4887871 96.50939 AL
6 Alabama 2021-04-21 1 524367 10807 1 4887871 96.50939 AL
state date fips cases deaths geo_id population pop_density abb
58089 Wyoming 2022-09-11 56 175290 1884 56 577737 5.950611 WY
58090 Wyoming 2022-08-21 56 173487 1871 56 577737 5.950611 WY
58091 Wyoming 2021-01-26 56 51152 596 56 577737 5.950611 WY
58092 Wyoming 2021-02-21 56 53795 662 56 577737 5.950611 WY
58093 Wyoming 2021-08-22 56 70671 809 56 577737 5.950611 WY
58094 Wyoming 2021-03-20 56 55581 693 56 577737 5.950611 WY
'data.frame': 58094 obs. of 9 variables:
$ state : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
$ date : chr "2023-01-04" "2020-04-25" "2023-02-26" "2022-12-03" ...
$ fips : int 1 1 1 1 1 1 1 1 1 1 ...
$ cases : int 1587224 6213 1638348 1549285 8691 524367 1321892 1088370 1153149 814025 ...
$ deaths : int 21263 213 21400 21129 343 10807 19676 16756 16826 15179 ...
$ geo_id : int 1 1 1 1 1 1 1 1 1 1 ...
$ population : int 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
$ pop_density: num 96.5 96.5 96.5 96.5 96.5 ...
$ abb : chr "AL" "AL" "AL" "AL" ...
#format the data
cv_states$ date <- as.Date (cv_states$ date, format= "%Y-%m-%d" )
state_list <- unique (cv_states$ state)
cv_states$ state <- factor (cv_states$ state, levels = state_list)
abb_list <- unique (cv_states$ abb)
cv_states$ abb <- factor (cv_states$ abb, levels = abb_list)
cv_states <- cv_states[order (cv_states$ state, cv_states$ date),]
str (cv_states)
'data.frame': 58094 obs. of 9 variables:
$ state : Factor w/ 52 levels "Alabama","Alaska",..: 1 1 1 1 1 1 1 1 1 1 ...
$ date : Date, format: "2020-03-13" "2020-03-14" ...
$ fips : int 1 1 1 1 1 1 1 1 1 1 ...
$ cases : int 6 12 23 29 39 51 78 106 131 157 ...
$ deaths : int 0 0 0 0 0 0 0 0 0 0 ...
$ geo_id : int 1 1 1 1 1 1 1 1 1 1 ...
$ population : int 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 4887871 ...
$ pop_density: num 96.5 96.5 96.5 96.5 96.5 ...
$ abb : Factor w/ 52 levels "AL","AK","AZ",..: 1 1 1 1 1 1 1 1 1 1 ...
state date fips cases deaths geo_id population pop_density abb
1029 Alabama 2020-03-13 1 6 0 1 4887871 96.50939 AL
597 Alabama 2020-03-14 1 12 0 1 4887871 96.50939 AL
282 Alabama 2020-03-15 1 23 0 1 4887871 96.50939 AL
12 Alabama 2020-03-16 1 29 0 1 4887871 96.50939 AL
266 Alabama 2020-03-17 1 39 0 1 4887871 96.50939 AL
78 Alabama 2020-03-18 1 51 0 1 4887871 96.50939 AL
state date fips cases deaths geo_id population pop_density abb
57902 Wyoming 2023-03-18 56 185640 2009 56 577737 5.950611 WY
57916 Wyoming 2023-03-19 56 185640 2009 56 577737 5.950611 WY
57647 Wyoming 2023-03-20 56 185640 2009 56 577737 5.950611 WY
57867 Wyoming 2023-03-21 56 185800 2014 56 577737 5.950611 WY
58057 Wyoming 2023-03-22 56 185800 2014 56 577737 5.950611 WY
57812 Wyoming 2023-03-23 56 185800 2014 56 577737 5.950611 WY
state date fips cases deaths geo_id population pop_density abb
1029 Alabama 2020-03-13 1 6 0 1 4887871 96.50939 AL
597 Alabama 2020-03-14 1 12 0 1 4887871 96.50939 AL
282 Alabama 2020-03-15 1 23 0 1 4887871 96.50939 AL
12 Alabama 2020-03-16 1 29 0 1 4887871 96.50939 AL
266 Alabama 2020-03-17 1 39 0 1 4887871 96.50939 AL
78 Alabama 2020-03-18 1 51 0 1 4887871 96.50939 AL
state date fips cases
Washington : 1158 Min. :2020-01-21 Min. : 1.00 Min. : 1
Illinois : 1155 1st Qu.:2020-12-06 1st Qu.:16.00 1st Qu.: 112125
California : 1154 Median :2021-09-11 Median :29.00 Median : 418120
Arizona : 1153 Mean :2021-09-10 Mean :29.78 Mean : 947941
Massachusetts: 1147 3rd Qu.:2022-06-17 3rd Qu.:44.00 3rd Qu.: 1134318
Wisconsin : 1143 Max. :2023-03-23 Max. :72.00 Max. :12169158
(Other) :51184
deaths geo_id population pop_density
Min. : 0 Min. : 1.00 Min. : 577737 Min. : 1.292
1st Qu.: 1598 1st Qu.:16.00 1st Qu.: 1805832 1st Qu.: 43.659
Median : 5901 Median :29.00 Median : 4468402 Median : 107.860
Mean : 12553 Mean :29.78 Mean : 6397965 Mean : 423.031
3rd Qu.: 15952 3rd Qu.:44.00 3rd Qu.: 7535591 3rd Qu.: 229.511
Max. :104277 Max. :72.00 Max. :39557045 Max. :11490.120
NA's :1106
abb
WA : 1158
IL : 1155
CA : 1154
AZ : 1153
MA : 1147
WI : 1143
(Other):51184
# Add new_cases and new_deaths and correct outliers
for (i in 1 : length (state_list)) {
cv_subset = subset (cv_states, state == state_list[i])
cv_subset = cv_subset[order (cv_subset$ date),]
cv_subset$ new_cases = cv_subset$ cases[1 ]
cv_subset$ new_deaths = cv_subset$ deaths[1 ]
for (j in 2 : nrow (cv_subset)) {
cv_subset$ new_cases[j] = cv_subset$ cases[j] - cv_subset$ cases[j - 1 ]
cv_subset$ new_deaths[j] = cv_subset$ deaths[j] - cv_subset$ deaths[j - 1 ]
}
cv_states$ new_cases[cv_states$ state== state_list[i]] = cv_subset$ new_cases
cv_states$ new_deaths[cv_states$ state== state_list[i]] = cv_subset$ new_deaths
}
cv_states <- cv_states %>% dplyr:: filter (date >= "2021-06-01" )
p1 <- ggplot (cv_states, aes (x = date, y = new_cases, color = state)) + geom_line () + geom_point (size = .5 , alpha = 0.5 )
ggplotly (p1)
p1 <- NULL
p2 <- ggplot (cv_states, aes (x = date, y = new_deaths, color = state)) + geom_line () + geom_point (size = .5 , alpha = 0.5 )
ggplotly (p2)
p2 <- NULL
cv_states$ new_cases[cv_states$ new_cases< 0 ] = 0
cv_states$ new_deaths[cv_states$ new_deaths< 0 ] = 0
for (i in 1 : length (state_list)) {
cv_subset = subset (cv_states, state == state_list[i])
cv_subset$ cases = cv_subset$ cases[1 ]
cv_subset$ deaths = cv_subset$ deaths[1 ]
for (j in 2 : nrow (cv_subset)) {
cv_subset$ cases[j] = cv_subset$ new_cases[j] + cv_subset$ cases[j - 1 ]
cv_subset$ deaths[j] = cv_subset$ new_deaths[j] + cv_subset$ deaths[j - 1 ]
}
cv_states$ cases[cv_states$ state == state_list[i]] = cv_subset$ cases
cv_states$ deaths[cv_states$ state == state_list[i]] = cv_subset$ deaths
}
cv_states$ new_cases = zoo:: rollmean (cv_states$ new_cases, k = 7 , fill = NA , align = 'right' ) %>% round (digits = 0 )
cv_states$ new_deaths = zoo:: rollmean (cv_states$ new_deaths, k = 7 , fill = NA , align = 'right' ) %>% round (digits = 0 )
p2 <- ggplot (cv_states, aes (x = date, y = new_deaths, color = state)) + geom_line () + geom_point (size = .5 , alpha = 0.5 )
ggplotly (p2)
# Add additional variables
cv_states$ per100k = as.numeric (format (round (cv_states$ cases/ (cv_states$ population/ 100000 ),1 ),nsmall= 1 ))
cv_states$ newper100k = as.numeric (format (round (cv_states$ new_cases/ (cv_states$ population/ 100000 ),1 ),nsmall= 1 ))
Warning: NAs introduced by coercion
cv_states$ deathsper100k = as.numeric (format (round (cv_states$ deaths/ (cv_states$ population/ 100000 ),1 ),nsmall= 1 ))
cv_states$ newdeathsper100k = as.numeric (format (round (cv_states$ new_deaths/ (cv_states$ population/ 100000 ),1 ),nsmall= 1 ))
Warning: NAs introduced by coercion
cv_states = cv_states %>% mutate (naive_CFR = round ((deaths* 100 / cases),2 ))
cv_states_today = subset (cv_states, date== max (cv_states$ date))
# Explore scatterplots using plot_ly()
p1 <- cv_states_today %>%
plot_ly (x = ~ pop_density, y = ~ cases,
type = 'scatter' , mode = 'markers' , color = ~ state,
size = ~ population, sizes = c (5 , 70 ), marker = list (sizemode= 'diameter' , opacity= 0.5 ))
cv_states_today_filter <- cv_states_today %>% filter (state != "District of Columbia" )
p2 <- cv_states_today_filter %>%
plot_ly (x = ~ pop_density, y = ~ cases,
type = 'scatter' , mode = 'markers' , color = ~ state,
size = ~ population, sizes = c (5 , 70 ), marker = list (sizemode= 'diameter' , opacity= 0.5 ))
p3 <- cv_states_today_filter %>%
plot_ly (x = ~ pop_density, y = ~ deathsper100k,
type = 'scatter' , mode = 'markers' , color = ~ state,
size = ~ population, sizes = c (5 , 70 ), marker = list (sizemode= 'diameter' , opacity= 0.5 ))
p3 <- cv_states_today_filter %>%
plot_ly (x = ~ pop_density, y = ~ deathsper100k,
type = 'scatter' , mode = 'markers' , color = ~ state,
size = ~ population, sizes = c (5 , 70 ), marker = list (sizemode= 'diameter' , opacity= 0.5 ),
hoverinfo = 'text' ,
text = ~ paste (paste (state, ":" , sep = "" ),
paste (" Cases per 100k: " , per100k, sep = "" ),
paste (" Deaths per 100k: " , deathsper100k, sep = "" ), sep = "<br>" )) %>%
layout (title = "Population-normalized COVID-19 deaths (per 100k) vs. population density for US states" ,
yaxis = list (title = "Deaths per 100k" ), xaxis = list (title = "Population Density" ),
hovermode = "compare" )
# Explore scatterplot trend interactively using ggplotly() and geom_smooth()
p <- ggplot (cv_states_today_filter, aes (x = pop_density, y = deathsper100k, size = population)) +
geom_point () +
geom_smooth (method = "lm" , se = FALSE , color = "blue" )
ggplotly (p)
`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 1 rows containing non-finite values (`stat_smooth()`).
Warning: The following aesthetics were dropped during statistical transformation: size
ℹ This can happen when ggplot fails to infer the correct grouping structure in
the data.
ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
variable into a factor?
# Multiple line chart
plot_ly (cv_states, x = ~ date, y = ~ naive_CFR, color = ~ state, type = "scatter" , mode = "lines" )
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
Returning the palette you asked for with that many colors
cv_states %>%
filter (state == "Florida" ) %>%
plot_ly (x = ~ date) %>%
add_lines (y = ~ new_cases, name = "New Cases" , line = list (color = 'blue' ), hoverinfo = "y+text" ,
text = ~ paste ("Date: " , date, "<br>New Cases: " , new_cases)) %>%
add_lines (y = ~ new_deaths, name = "New Deaths" , line = list (color = 'red' ), hoverinfo = "y+text" ,
text = ~ paste ("Date: " , date, "<br>New Deaths: " , new_deaths))
# Heatmaps
library (tidyr)
cv_states_mat <- cv_states %>% select (state, date, new_cases) %>% dplyr:: filter (date > as.Date ("2021-06-15" ))
cv_states_mat2 <- as.data.frame (pivot_wider (cv_states_mat, names_from = state, values_from = "new_cases" ))
rownames (cv_states_mat2) <- cv_states_mat2$ date
cv_states_mat2$ date <- NULL
cv_states_mat2 <- as.matrix (cv_states_mat2)
plot_ly (x = colnames (cv_states_mat2), y = rownames (cv_states_mat2),
z = ~ cv_states_mat2,
type = "heatmap" ,
showscale = TRUE )
cv_states_mat <- cv_states %>% select (state, date, newper100k) %>% dplyr:: filter (date > as.Date ("2021-06-15" ))
cv_states_mat2 <- as.data.frame (pivot_wider (cv_states_mat, names_from = state, values_from = "newper100k" ))
rownames (cv_states_mat2) <- cv_states_mat2$ date
cv_states_mat2$ date <- NULL
cv_states_mat2 <- as.matrix (cv_states_mat2)
plot_ly (x = colnames (cv_states_mat2), y = rownames (cv_states_mat2),
z = ~ cv_states_mat2,
type = "heatmap" ,
showscale = TRUE )
filter_dates <- seq (as.Date ("2021-06-15" ), as.Date ("2021-11-01" ), by = "2 weeks" )
cv_states_mat <- cv_states %>% select (state, date, newper100k) %>% filter (date %in% filter_dates)
cv_states_mat2 <- as.data.frame (pivot_wider (cv_states_mat, names_from = state, values_from = "newper100k" ))
rownames (cv_states_mat2) <- cv_states_mat2$ date
cv_states_mat2$ date <- NULL
cv_states_mat2 <- as.matrix (cv_states_mat2)
plot_ly (x = colnames (cv_states_mat2), y = rownames (cv_states_mat2),
z = ~ cv_states_mat2,
type = "heatmap" ,
showscale = TRUE )
# Map
pick.date = "2021-10-25"
cv_per100 <- cv_states %>% filter (date== pick.date) %>% select (state, abb, newper100k, cases, deaths) # select data
cv_per100$ state_name <- cv_per100$ state
cv_per100$ state <- cv_per100$ abb
cv_per100$ abb <- NULL
cv_per100$ hover <- with (cv_per100, paste (state_name, '<br>' , "Cases per 100k: " , newper100k, '<br>' , "Cases: " , cases, '<br>' , "Deaths: " , deaths))
set_map_details <- list (
scope = 'usa' ,
projection = list (type = 'albers usa' ),
showlakes = TRUE ,
lakecolor = toRGB ('white' )
)
shadeLimit <- 125
fig <- plot_geo (cv_per100, locationmode = 'USA-states' ) %>%
add_trace (
z = ~ newper100k, text = ~ hover, locations = ~ state,
color = ~ newper100k, colors = 'Purples'
)
fig <- fig %>% colorbar (title = paste0 ("Cases per 100k: " , pick.date), limits = c (0 ,shadeLimit))
fig <- fig %>% layout (
title = paste ('Cases per 100k by State as of ' , pick.date, '<br>(Hover for value)' ),
geo = set_map_details
)
fig_pick.date <- fig
cv_per100 <- cv_states_today %>% select (state, abb, newper100k, cases, deaths)
cv_per100$ state_name <- cv_per100$ state
cv_per100$ state <- cv_per100$ abb
cv_per100$ abb <- NULL
cv_per100$ hover <- with (cv_per100, paste (state_name, '<br>' , "Cases per 100k: " , newper100k, '<br>' , "Cases: " , cases, '<br>' , "Deaths: " , deaths))
set_map_details <- list (
scope = 'usa' ,
projection = list (type = 'albers usa' ),
showlakes = TRUE ,
lakecolor = toRGB ('white' )
)
fig <- plot_geo (cv_per100, locationmode = 'USA-states' ) %>%
add_trace (
z = ~ newper100k, text = ~ hover, locations = ~ state,
color = ~ newper100k, colors = 'Purples'
)
fig <- fig %>% colorbar (title = paste0 ("Cases per 100k: " , Sys.Date ()), limits = c (0 ,shadeLimit))
fig <- fig %>% layout (
title = paste ('Cases per 100k by State as of' , Sys.Date (), '<br>(Hover for value)' ),
geo = set_map_details
)
fig_Today <- fig
subplot (fig_pick.date, fig_Today, nrows = 2 , margin = .05 )